libraries

library(Seurat)
library(Matrix)
library(dplyr)
library(tidyr)
library(ggplot2)
library(tibble)
library(cowplot)  
library(patchwork)
library(stringr)
library(pheatmap)
library(reshape2)
library(readxl)
library(stringr)
library(DESeq2)
library(gridExtra)
library(ggrepel)
library(ggpubr)
library(tidyverse)
library(viridisLite)

theme_set(theme_classic())


data_dir = "/home/zli4/MSK_DEC24/Analysis"

out_dir = "/fh/fast/sun_w/zhaoheng_li/MSK_DEC24/Analysis/outs/anova"

load data

integrated = readRDS(file.path(data_dir,"integrated-rpca_BDF-ref_withMeta.RDS"))
# remove cells without cell type annotations
keep.cells= rownames(integrated@meta.data)[
  !is.na(integrated@meta.data$celltype)
]

integrated= subset(
  integrated,
  cells = keep.cells
)
integrated$celltype = as.factor(integrated$celltype)
summary(integrated$celltype)
##                  DE              Ductal         Endothelial                 EnP 
##               16283               18880                1114               19845 
##                 ESC              ESC_DE               Liver Mesenchymal_Stromal 
##               15280                4559                9363                1698 
##                 PFG                 PGT                  PP            SC-alpha 
##                8440                2627               14249                9684 
##             SC-beta            SC-delta               SC-EC             Stromal 
##               10152                1856               13743                3944

order cell types

celltype_orders = c(
  "ESC",
  "ESC_DE",
  "DE",
  "PFG",
  "PGT",
  "PP",
  "EnP",
  "SC-EC",
  "SC-alpha",
  "SC-beta",
  "SC-delta",
  "Ductal",
  "Mesenchymal_Stromal",
  "Stromal",
  "Endothelial",
  "Liver"
)

integrated$celltype = factor(integrated$celltype,
                              levels =celltype_orders ,
                              ordered = TRUE)

celltype_palette = readRDS("/fh/fast/sun_w/zhaoheng_li/MSK_DEC24/Analysis/outs/celltype_palette.RDS")

split seurat object by time points

obj = integrated%>%subset(subset = time_point%in%c("D-1","D3","D7","D11","D18"))

obj$time_point = droplevels(obj$time_point)
summary(obj$time_point)
##   D-1    D3    D7   D11   D18 
## 15775 18602 16943 12052 36357
obj_list = SplitObject(obj, split.by = "time_point")
obj_list
## $D18
## An object of class Seurat 
## 36390 features across 36357 samples within 1 assay 
## Active assay: RNA (36390 features, 2000 variable features)
##  3 layers present: data, counts, scale.data
##  4 dimensional reductions calculated: pca, umap, integrated.rpca, rpca.umap
## 
## $D7
## An object of class Seurat 
## 36390 features across 16943 samples within 1 assay 
## Active assay: RNA (36390 features, 2000 variable features)
##  3 layers present: data, counts, scale.data
##  4 dimensional reductions calculated: pca, umap, integrated.rpca, rpca.umap
## 
## $D3
## An object of class Seurat 
## 36390 features across 18602 samples within 1 assay 
## Active assay: RNA (36390 features, 2000 variable features)
##  3 layers present: data, counts, scale.data
##  4 dimensional reductions calculated: pca, umap, integrated.rpca, rpca.umap
## 
## $D11
## An object of class Seurat 
## 36390 features across 12052 samples within 1 assay 
## Active assay: RNA (36390 features, 2000 variable features)
##  3 layers present: data, counts, scale.data
##  4 dimensional reductions calculated: pca, umap, integrated.rpca, rpca.umap
## 
## $`D-1`
## An object of class Seurat 
## 36390 features across 15775 samples within 1 assay 
## Active assay: RNA (36390 features, 2000 variable features)
##  3 layers present: data, counts, scale.data
##  4 dimensional reductions calculated: pca, umap, integrated.rpca, rpca.umap

create pseudobulk samples

f_pseudobulk_list = file.path(out_dir,"pseudobulk_list.RDS")

if(file.exists(f_pseudobulk_list)){
  pseudo_obj_list  = readRDS(f_pseudobulk_list)
}else{
  pseudo_obj_list = list()
  for(tp in c("D-1","D3","D7","D11", "D18")){
    obj = obj_list[[tp]]
  
    obj$celltype = as.factor(obj$celltype)
    obj$background = as.factor(obj$background)
    obj$genotype = as.factor(obj$genotype)
    levels(obj$celltype)=gsub("_","-",levels(obj$celltype)) # rename celltype levels
    pseudo_obj = AggregateExpression(obj, assays = "RNA", return.seurat = T,
                                      group.by = c("sample_name","feature.4", "celltype"))
    
    pseudo_obj$sample_name = gsub("-","_",pseudo_obj$sample_name)
    
    # process metadata
    pseudo_meta = pseudo_obj[[]]
    pseudo_meta$pseudo.sample.id = pseudo_meta$orig.ident
    obj$pseudo.sample.id = as.factor(paste(obj$sample_name,obj$feature.4,obj$celltype,sep='_'))
    obj$pseudo.sample.id = gsub("-","_", obj$pseudo.sample.id)
    pseudo_meta$pseudo.sample.id = gsub("-","_",rownames(pseudo_meta))
    pseudo_meta = pseudo_meta %>%
                            left_join(obj[[]]%>%select(pseudo.sample.id,genotype,source,background)%>%distinct(), 
                                           by = c("pseudo.sample.id"))
    pseudo_meta$celltype = as.factor(pseudo_meta$celltype)
    pseudo_meta$genotype = as.factor(pseudo_meta$genotype)
  
    n_df=obj[[]]%>%dplyr::count(pseudo.sample.id) 
    pseudo_meta = pseudo_meta%>%left_join(n_df)
    rownames(pseudo_meta) = pseudo_meta$orig.ident
    pseudo_obj@meta.data = pseudo_meta
    
    pseudo_obj$celltype = as.character( pseudo_obj$celltype)
    pseudo_obj$celltype = ifelse(pseudo_obj$celltype == 'ESC-DE','ESC_DE',
                                 ifelse(pseudo_obj$celltype == 'Mesenchymal-Stromal','Mesenchymal_Stromal',pseudo_obj$celltype ))
    pseudo_obj$celltype = as.factor(pseudo_obj$celltype)
    pseudo_obj$genotype = as.factor(pseudo_obj$genotype)
    pseudo_obj_list[[tp]] = pseudo_obj
  }
  names(pseudo_obj_list) = c("D-1","D3","D7","D11", "D18")
  saveRDS(pseudo_obj_list,f_pseudobulk_list)
}
pseudo_obj_list = lapply(pseudo_obj_list,function(pseudo_obj){
  pseudo_obj$celltype = factor(pseudo_obj$celltype,
                              levels =celltype_orders ,
                              ordered = TRUE)
  pseudo_obj
})

remove pseudobulk samples with < 30 cells

min_cells = 30


pseudo_obj_list = lapply(pseudo_obj_list,function(pseudo_obj){
  pseudo_obj%>%subset(subset = n>=min_cells)
})
pseudo_obj_list
## $`D-1`
## An object of class Seurat 
## 36390 features across 93 samples within 1 assay 
## Active assay: RNA (36390 features, 0 variable features)
##  3 layers present: counts, data, scale.data
## 
## $D3
## An object of class Seurat 
## 36390 features across 129 samples within 1 assay 
## Active assay: RNA (36390 features, 0 variable features)
##  3 layers present: counts, data, scale.data
## 
## $D7
## An object of class Seurat 
## 36390 features across 136 samples within 1 assay 
## Active assay: RNA (36390 features, 0 variable features)
##  3 layers present: counts, data, scale.data
## 
## $D11
## An object of class Seurat 
## 36390 features across 92 samples within 1 assay 
## Active assay: RNA (36390 features, 0 variable features)
##  3 layers present: counts, data, scale.data
## 
## $D18
## An object of class Seurat 
## 36390 features across 204 samples within 1 assay 
## Active assay: RNA (36390 features, 0 variable features)
##  3 layers present: counts, data, scale.data

feature selection for PCA

for each time point, use top 5000 highly variable genes (HVGs)

pca_features = list()
for(tp in c("D-1","D3","D7","D11","D18")){
  pseudo_obj = pseudo_obj_list[[tp]]

  pseudo_obj = FindVariableFeatures(pseudo_obj,nfeatures = 5000)
  pca_features[[tp]] = VariableFeatures(pseudo_obj)
  pseudo_obj = ScaleData(pseudo_obj,verbose = F)
  pseudo_obj=RunPCA(pseudo_obj,npcs = min(50,ncol(pseudo_obj)-1),verbose = F)

  pve <- (pseudo_obj[["pca"]]@stdev)^2            # eigen-values (= variance of each PC)
  pve <- pve / sum(pve)                           # proportion of total variance
  pc_labs = function(i) sprintf("PC%d (%.1f%%)", i, 100 * pve[i])

  df = as.data.frame(cbind(Embeddings(pseudo_obj, reduction = "pca")[,1:2],pseudo_obj[[]]))
  df$celltype = factor(df$celltype,
                              levels =celltype_orders ,
                              ordered = TRUE)
  df$celltype = droplevels(df$celltype)

  # set shapes
  shape_values <- c(16, 17, 15, 3, 7, 8, 18, 0, 1, 2)
  shape_values = shape_values[1:length(unique(df$celltype))]
  names(shape_values) <- unique(df$celltype)
  
  fig1 <- ggplot(df, aes(x     = PC_1,
                        y     = PC_2,
                        color = celltype,
                        shape = celltype)) +
    geom_point(size  = 1,
               alpha = 0.8) +                 # 80% opacity
    scale_shape_manual(values = shape_values) +  # assign each celltype its own shape
    labs(
      title = "",
      x     = pc_labs(1),
      y     = pc_labs(2),
      color = "cell type",
      shape = "cell type"
    ) +
    scale_color_manual(
      values = celltype_palette,   
      drop   = TRUE                
    )+
    theme_classic()+
    guides(
      color = guide_legend(
        override.aes = list(size = 4)  # Increase legend point size
      )
    )


  fig2 = df%>%ggplot(aes(x=PC_1,y=PC_2,color = n))+
    geom_point(size=1)+
    viridis::scale_color_viridis()+
    labs(
      title = "",
      x     = pc_labs(1),
      y     = pc_labs(2),
     color  = "# cells"
    )+
    theme_classic()
  
  fig3 = df%>%ggplot(aes(x=PC_1,y=PC_2,color = background))+
    geom_point(size=1)+
    labs(
      title = "",
      x     = pc_labs(1),
      y     = pc_labs(2),
    )+
    theme_classic()+
    guides(
      color = guide_legend(
        override.aes = list(size = 4)  # Increase legend point size
      )
    )
  fig4 = df%>%ggplot(aes(x=PC_1,y=PC_2,color = source))+
    geom_point(size=1)+
    labs(
      title = "",
      x     = pc_labs(1),
      y     = pc_labs(2),
    )+
    theme_classic()+
    guides(
      color = guide_legend(
        override.aes = list(size = 4)  # Increase legend point size
      )
    )
  fig5 = df%>%ggplot(aes(x=PC_1,y=PC_2,color = genotype,shape=background,label=feature.4))+
    geom_point(size=1)+
    ggrepel::geom_text_repel(aes(label = feature.4),size =2) +
    labs(
      title = "",
      x     = pc_labs(1),
      y     = pc_labs(2),
    )+
    theme_classic()+
    theme(legend.position = "none")
  
  fig = gridExtra::grid.arrange(
        fig1, fig2, fig3, fig4, fig5,
        layout_matrix = rbind(
          c(1, 2),   # row-1
          c(3, 4),   # row-2
          c(5, 5)    # row-3  ← fig5 spans both columns
        ),
        heights = c(1, 1, 2)   # give fig5 double the row-height
      )

  ggsave(file.path(out_dir,paste0("pc_plots/",tp,".png",sep='')),fig,width = 10,height = 12)
}

if(file.exists( file.path(out_dir,"pca_features.RDS"))){
  pca_features = readRDS(file.path(out_dir,"pca_features.RDS"))
}else{
  saveRDS(pca_features,file.path(out_dir,"pca_features.RDS"))
}

Analysis

for(tp in c("D-1","D3","D7","D11","D18")){
    obj = pseudo_obj_list[[tp]]
    # filter genotypes
    obj$genotype = as.factor(obj$genotype)
    tmp = obj[[]]%>%select(genotype,feature.4)%>%distinct()
    obj = obj%>%subset(subset = genotype%in%names(which(summary(tmp$genotype)>=2)))
    obj$genotype = droplevels(obj$genotype)
    pseudo_obj_list[[tp]] = obj
}
for(tp in c("D-1","D3","D7","D11","D18")){
  if(!file.exists(file.path(out_dir,paste0(tp,"_pseudo_lm_res.RDS",sep='')))){
    features = pca_features[[tp]] # time-point specific HVGs
    print(length(features))
    print(tp)
    obj = pseudo_obj_list[[tp]]
    
    # filter genotypes
    obj$genotype = as.factor(obj$genotype)
    tmp = obj[[]]%>%select(genotype,feature.4)%>%distinct()
    obj = obj%>%subset(subset = genotype%in%names(which(summary(tmp$genotype)>=2)))
    obj$genotype = droplevels(obj$genotype)
    
    meta = obj[[]]%>%dplyr::select(source,background,celltype,genotype,feature.4)
    meta$celltype = as.factor(meta$celltype)
    dat = GetAssayData(obj,layer='data')[features,]
    counts = GetAssayData(obj,layer='counts')[features,]
    rd=colSums(counts)
    meta$rd = rd
    
    meta$genotype = as.factor( meta$genotype) # include genotypes with only 1 clones
    meta$sample = rownames(meta)
    summary( meta$genotype)
    
    meta$genotype = relevel( meta$genotype,ref="WT")
    genotype_mtx =model.matrix(~ genotype,  meta) # craete dummy variables
    colnames(genotype_mtx)[1] = "genotype_ref"
    genotype_mtx = as.data.frame(genotype_mtx)
    genotype_mtx$sample = rownames(genotype_mtx)
    
    meta = meta%>%left_join(genotype_mtx,by = join_by(sample))
    
    if(tp=="D-1"){ # handle D-1 data
      df= parallel::mclapply(features,function(g){
        y_df = meta; y_df$y = dat[g,]
        mod0 = lm(y~rd,data=y_df)
        mod1 = lm(y~rd+source,data=y_df)
        mod2 = lm(y~rd+source+background,data=y_df)
        mod3 = mod2
        r2 = c(summary(mod0)$r.squared,
               summary(mod1)$r.squared-summary(mod0)$r.squared, # data source
               summary(mod2)$r.squared-summary(mod1)$r.squared, # cell background
               NA)
        names(r2) = c("read-depth","data-source","cell-background","cell-type")
        
        genotype_KO = setdiff(levels(meta$genotype),"WT")
        genotype_r2 = rep(NA,length(genotype_KO));names(genotype_r2) = genotype_KO
        genotype_pval = rep(NA,length(genotype_KO));names(genotype_pval) = paste("pval",genotype_KO,sep="_")
        for(geno in genotype_KO){
          # full model
          df0 = y_df%>%dplyr::select(-feature.4,-genotype,-genotype_ref,-sample,-celltype) 
          mod_full = lm(y~.,data = df0)
          v = as.symbol(paste0("genotype",geno))
          # remove genotype of interest
          df1 = y_df%>%dplyr::select(-feature.4,-genotype,-genotype_ref,-sample,-paste0("genotype",geno),-celltype) 
          mod4 = lm(y~.,data = df1)
          genotype_r2[geno] = summary(mod_full)$r.squared-summary(mod4)$r.squared
          genotype_pval[paste("pval",geno,sep="_")] = summary(mod_full)$coefficients[setdiff( names(mod_full$coefficients), names(mod4$coefficients)),"Pr(>|t|)"]
        }
        res = c(r2,genotype_r2,genotype_pval)
        return(res)
      },mc.cores = 20)
    }else{
      tictoc::tic()
      df= parallel::mclapply(features,function(g){
        y_df = meta; y_df$y = dat[g,]
        mod0 = lm(y~rd,data=y_df)
        mod1 = lm(y~rd+source,data=y_df)
        mod2 = lm(y~rd+source+background,data=y_df)
        mod3 = lm(y~rd+source+background+celltype,data=y_df)
        r2 = c(summary(mod0)$r.squared,
               summary(mod1)$r.squared-summary(mod0)$r.squared, # data source
               summary(mod2)$r.squared-summary(mod1)$r.squared, # cell background
               ifelse(summary(mod3)$r.squared-summary(mod2)$r.squared==0,NA,summary(mod3)$r.squared-summary(mod2)$r.squared))
        names(r2) = c("read-depth","data-source","cell-background","cell-type")
        
        genotype_KO = setdiff(levels(meta$genotype),"WT")
        genotype_r2 = rep(NA,length(genotype_KO));names(genotype_r2) = genotype_KO
        genotype_pval = rep(NA,length(genotype_KO));names(genotype_pval) = paste("pval",genotype_KO,sep="_")
        for(geno in genotype_KO){
          # full model
          df0 = y_df%>%dplyr::select(-feature.4,-genotype,-genotype_ref,-sample) 
          mod_full = lm(y~.,data = df0)
          v = as.symbol(paste0("genotype",geno))
          # remove genotype of interest
          df1 = y_df%>%dplyr::select(-feature.4,-genotype,-genotype_ref,-sample,-paste0("genotype",geno)) 
          mod4 = lm(y~.,data = df1)
          genotype_r2[geno] = summary(mod_full)$r.squared-summary(mod4)$r.squared
          genotype_pval[paste("pval",geno,sep="_")] = summary(mod_full)$coefficients[setdiff( names(mod_full$coefficients), names(mod4$coefficients)),"Pr(>|t|)"]
        }
        res = c(r2,genotype_r2,genotype_pval)
        return(res)
      },mc.cores = 20)
      tictoc::toc()
    }
    df=do.call("rbind", df)
    rownames(df) = features
    saveRDS(df,file.path(out_dir,paste0(tp,"_pseudo_lm_res.RDS",sep='')))
  }
}

Heatmap of proportion variance explained for time-point-specific HVGs

only time-point specific HVGs with <75% percent zeros across cells are considered

#pdf(paste(out_dir,"htmap.pdf",sep=''))
for(tp in c("D-1","D3","D7","D11","D18")){
  features_tp = pca_features[[tp]]
  pseudo_obj = pseudo_obj_list[[tp]]
  dat = GetAssayData(pseudo_obj)[features_tp,]
  zero_prop = apply(dat,1,function(x){
    sum(x==0)/length(x)
  })
  features_tp = features_tp[which(zero_prop<quantile(zero_prop,0.75))]

  lm_res = readRDS(file.path(out_dir,paste0(tp,"_pseudo_lm_res.RDS",sep='')))
  if(tp=="D-1"){
    mtx=lm_res[,-c(4,which(startsWith(colnames(lm_res), "pval_")))]
  }else{
    mtx =lm_res[,-c(which(startsWith(colnames(lm_res), "pval_")))]
  }
  mtx = mtx[features_tp,] # keep on tp-specific genes
  if(sum(is.na(rowSums(mtx)))!=0){
      mtx=mtx[-which(is.na(rowSums(mtx))),]
  }
  pvals = lm_res[,which(startsWith(colnames(lm_res), "pval_"))]
  pvals=pvals[rownames(mtx),]
  genotypes = sub(".*_", "", colnames(pvals))
  for(g in rownames(mtx)){ # keep only r2 corresponding to significant coefficients
    for(genotype in genotypes){
      p = p.adjust(pvals[g,paste("pval",genotype,sep="_")], method = "fdr")
      mtx[g,genotype] = ifelse(p<0.05,mtx[g,genotype],0)
    }
  }
  print(dim(mtx))
  
   mtx = log10(mtx+10^{-12})
   
   
   mtx0 = mtx
   mtx0[which(mtx0<(-4))] = -4 # truncate
   mtx0_sub = mtx0[,-(1:4)] # genotypes only
   breaks = c(seq(min(mtx0),-1,length.out=30),seq(-1,max(mtx0),length.out=50)[-1])
   breaks1 = c(seq(min(mtx0_sub),-1,length.out=30),seq(-1,max(mtx0_sub),length.out=50)[-1])
   htmap = pheatmap::pheatmap(mtx0,main = paste(tp, ": log10(proportion var. explained)",sep=""),cluster_cols = F,scale = 'none',show_rownames=F,color=plasma(length(breaks) - 1),clustering_method = "ward.D")
    print(htmap)
  print(pheatmap::pheatmap(mtx0_sub,main = paste(tp, ": log10(proportion var. explained): genotypes",sep=""),scale = 'none',show_rownames=F,color=plasma(length(breaks1) - 1),clustering_method = "ward.D"))
}
## [1] 3746   29

## [1] 3749   29

## [1] 3750   19

## [1] 3691   19

## [1] 3745   21

check the numbers of pseudobulk samples for each cell type

for(tp in c("D-1","D3","D7","D11","D18")){
  pseudo_obj = pseudo_obj_list[[tp]]
  pseudo_obj$celltype = droplevels(pseudo_obj$celltype)
  print(summary(pseudo_obj$celltype))
}
##    ESC ESC_DE 
##     71     12 
##    ESC ESC_DE     DE 
##      7     31     76 
##          DE         PFG     Stromal Endothelial       Liver 
##           3          46           6           4          50 
##         PFG          PP         EnP     Stromal Endothelial       Liver 
##           4          48          14           6           3           1 
##                  PP                 EnP               SC-EC            SC-alpha 
##                   3                  25                  25                  24 
##             SC-beta            SC-delta              Ductal Mesenchymal_Stromal 
##                  16                   8                  43                   1 
##             Stromal               Liver 
##                  12                   4

aggregate results from all time points in to a long-format dataframe in long format

mtx_lst = list()

for(tp in c("D-1","D3","D7","D11","D18")){

  features_tp = pca_features[[tp]]
  pseudo_obj = pseudo_obj_list[[tp]]
  dat = GetAssayData(pseudo_obj)[features_tp,]
  zero_prop = apply(dat,1,function(x){
    sum(x==0)/length(x)
  }) 
  features_tp = features_tp[which(zero_prop<quantile(zero_prop,0.75))]
  lm_res = readRDS(file.path(out_dir,paste0(tp,"_pseudo_lm_res.RDS",sep='')))

  if(tp=="D-1"){
    mtx=lm_res[,-c(4,which(startsWith(colnames(lm_res), "pval_")))]
  }else{
    mtx =lm_res[,-c(which(startsWith(colnames(lm_res), "pval_")))]
  }
  mtx = mtx[features_tp,] # keep on tp-specific genes
  if(sum(is.na(rowSums(mtx)))!=0){
      mtx=mtx[-which(is.na(rowSums(mtx))),]
  }
  pvals = lm_res[,which(startsWith(colnames(lm_res), "pval_"))]
  pvals=pvals[rownames(mtx),]
  genotypes = sub(".*_", "", colnames(pvals))
  
  print(dim(mtx))
  
   mtx = log10(mtx+10^{-12}) 
   
  # for each gene, adjust genotype p-valus by FDR
  # keep only r2 corresponding to significant coefficients
  for(g in rownames(mtx)){ 
    for(genotype in genotypes){
      p = p.adjust(pvals[g,paste("pval",genotype,sep="_")], method = "fdr")
      mtx[g,genotype] = ifelse(p<0.05,mtx[g,genotype],-Inf)
    }
  }
    
   df = as.data.frame(mtx)
   df$time = tp
   df$gene = rownames(df)
   mtx_lst[[tp]] = df
}
## [1] 3746   29
## [1] 3749   29
## [1] 3750   19
## [1] 3691   19
## [1] 3745   21
# enforce all matrices have the same columns
names = unique(unlist(sapply(mtx_lst,function(df){
  colnames(df)
})))
mtx_lst = lapply(mtx_lst,function(df){
  df[,setdiff(names,colnames(df))] = NA
  return(df[,names])
})

df = do.call(rbind,mtx_lst)

df_long = reshape2::melt(df, id.vars = c("time","gene"), variable.name = "Factor", value.name = "Value")
df_long$time = factor(df_long$time,levels = c("D-1","D3","D7","D11","D18"))

df_long[1:2,]
##   time   gene     Factor      Value
## 1  D-1 CHCHD2 read-depth -0.7030275
## 2  D-1   CHGA read-depth -0.2560885

Check the number of cell types that occur in at least two genotypes at each time point

for(tp in c("D3","D7","D11","D18")){
  
  print(tp)
  pseudo_obj = pseudo_obj_list[[tp]]
  
  # remove cell types that only occur in one genotype
  df= pseudo_obj[[]]
  idx = (df%>%
           group_by(celltype)%>%
           summarise(n_distinct(genotype)))[,2]>=1 
  celltypes = (df%>%group_by(celltype)%>%summarise(n_distinct(genotype)))[,1][idx]
  pseudo_obj = pseudo_obj%>%subset(subset = celltype%in%celltypes)
  pseudo_obj$celltype = droplevels(pseudo_obj$celltype)
  print(levels(pseudo_obj$celltype))
}
## [1] "D3"
## [1] "ESC"    "ESC_DE" "DE"    
## [1] "D7"
## [1] "DE"          "PFG"         "Stromal"     "Endothelial" "Liver"      
## [1] "D11"
## [1] "PFG"         "PP"          "EnP"         "Stromal"     "Endothelial"
## [6] "Liver"      
## [1] "D18"
##  [1] "PP"                  "EnP"                 "SC-EC"              
##  [4] "SC-alpha"            "SC-beta"             "SC-delta"           
##  [7] "Ductal"              "Mesenchymal_Stromal" "Stromal"            
## [10] "Liver"

degree-of-freedom adjusted proportion of variance attributed to cell types

Proportion variance explained by cell type is adjusted by degree of freedom (number of cell types -1) in the panel “cell-type-scaled”.

df_long_scaled = df_long
df_long_scaled$Value <- ifelse(df_long$time =="D3" & df_long$Factor == "cell-type", df_long_scaled$Value-log10(2), df_long_scaled$Value)
df_long_scaled$Value <- ifelse(df_long$time =="D7" & df_long$Factor == "cell-type", df_long_scaled$Value-log10(4), df_long_scaled$Value)
df_long_scaled$Value <- ifelse(df_long$time =="D11" & df_long$Factor == "cell-type", df_long_scaled$Value-log10(5), df_long_scaled$Value)
df_long_scaled$Value <- ifelse(df_long$time =="D18" & df_long$Factor == "cell-type", df_long_scaled$Value-log10(9), df_long_scaled$Value)
df_long_scaled = df_long_scaled%>%filter(Factor=='cell-type')
df_long_scaled$Factor = 'cell-type-scaled'

unique(df_long$Factor)
##  [1] read-depth      data-source     cell-background BMPR1A         
##  [5] FOXA1           FOXA2           GATA4           GATA4het       
##  [9] GATA6           GATA6het        GLIS3           GSC            
## [13] HHEX            HHEXe           MNX1            NANOGehet      
## [17] NEUROD1         NGN3            NKX2-2          ONECUT1e       
## [21] OTUD5           PAX6            PBX1            PDX1           
## [25] QSER1           QSER1TET1       RFX6            TET1/2/3       
## [29] TLE3            cell-type      
## 30 Levels: read-depth data-source cell-background BMPR1A FOXA1 FOXA2 ... cell-type

Violin Plots summarizing proportions of variance explained by relavant factors at each time point

df_long = rbind(df_long,df_long_scaled)
fig1_scaled=df_long%>%filter(Factor%in%c("read-depth", "data-source",
                                         "cell-background","cell-type","cell-type-scaled"))%>%
  ggplot(aes(x = time,y =Value,color = time))+
  geom_violin()+
  geom_boxplot(width = 0.1, fill = "white", aes(color = time),outlier.size = 0.5) +  # Boxplot
  ylab("log10(proportion \n variance explained)")+
  xlab("")+
  theme_bw()+
   scale_color_brewer(palette = "Set1")+
  guides(color="none")+
  facet_wrap(~Factor)
fig1_scaled
## Warning: Removed 7492 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Removed 7492 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

ggsave(
  filename = file.path(out_dir,"propVar_summary.png"),
  plot     = fig1_scaled,
  width    = 6,
  height   = 3,
  units    = "in",
  dpi      = 300
)
## Warning: Removed 7492 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Removed 7492 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

truncate y-axis at -4

Same as above, except for truncating y-axis at -4

fig1_scaled_truncated = df_long%>%
  mutate(Value =  ifelse(df_long$Value<(-4),NA,df_long$Value))%>%
  filter(Factor%in%c("read-depth","data-source","cell-background","cell-type","cell-type-scaled"))%>%
  ggplot(aes(x = time,y =Value,color = time))+
  geom_violin()+
  geom_boxplot(width = 0.1, fill = "white", aes(color = time),outlier.size = 0.5) +  # Boxplot
  ylab("log10(proportion \n variance explained)")+
  xlab("")+
  theme_bw()+
   scale_color_brewer(palette = "Set1")+
  guides(color="none")+
  facet_wrap(~Factor)
fig1_scaled_truncated
## Warning: Removed 9692 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Removed 9692 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

ggsave(
  filename = file.path(out_dir,"propVar_summary_truncated.png"),
  plot     = fig1_scaled_truncated,
  width    = 6,
  height   = 3,
  units    = "in",
  dpi      = 300
)
## Warning: Removed 9692 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Removed 9692 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Violin plots for proportion variance attributed to genotype

Only significant r-squares are kept for visualization.

Numbers on top indicates the number of genotypes incorporated in the analysis at each time point

# subset data frame for genotype: include only genes with FDR<0.05
df_long1 = df_long%>%
  filter(!Factor%in%c("read-depth","data-source","cell-background","cell-type","cell-type-scaled"))
  #mutate(Value =  ifelse(df_long1$Value<(-4),NA,df_long1$Value))
df_long1$id = paste(df_long1$time,df_long1$Factor,sep="_")
write.csv(df_long1,file.path(out_dir,"genotype_significant_propvar.csv"),row.names = FALSE)
# summarize the number of genotypes involved in the analysis for each day
ann = df_long1 %>%
  filter(!is.infinite(Value))%>%
  group_by(time, Factor) %>%
  summarise(
    median = median(Value, na.rm = TRUE)
  )%>%
  ungroup()%>%
  group_by(time)%>%
  summarise(num_genotype = sum(!is.na(median)))
## `summarise()` has grouped output by 'time'. You can override using the
## `.groups` argument.
fig2=df_long1 %>%
  filter(!is.infinite(Value))%>%
  group_by(time, Factor) %>%
  summarise(
    median = median(Value,na.rm=T)
  )%>%ggplot(aes(x=time,y=median,color = time))+
  geom_violin()+
  geom_boxplot(width = 0.1, aes(color = time)) +  # Boxplot within violin, without outliers
   geom_text(data =ann, 
  aes(x = time, y =-1, label = num_genotype), 
            size = 3, color = "black")+
  ylab( "medians of \n log10(proportion \n variance explained)") +
  xlab("")+
  ggtitle("all available genotypes")+
    scale_color_brewer(palette = "Set1")+
  guides(color = "none")+
  theme_bw()
## `summarise()` has grouped output by 'time'. You can override using the
## `.groups` argument.
fig2
## Warning: Removed 32 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Removed 32 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

ggsave(
  filename = file.path(out_dir,"genotype_median_summary.png"),
  plot     = fig2,
  width    = 6,
  height   = 3,
  units    = "in",
  dpi      = 300
)
## Warning: Removed 32 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Removed 32 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

proportion explained by individual genotypes

compute the proportion of genes with significant proportion variance explained by each genotype at each time point

significant_prop <- df_long1 %>%
  group_by(time, Factor) %>%
  summarise(
    total = n(),
    value_count = sum((!is.infinite(Value))),
    prop= value_count / total
  )
## `summarise()` has grouped output by 'time'. You can override using the
## `.groups` argument.
# prop = 1 only happens when the genotype is not included in the analysis (didn't pass filtering criteria)
significant_prop = significant_prop%>%filter(prop!=1)
significant_prop$id = paste(significant_prop$time,significant_prop$Factor,sep="_")

time-point specific violin plots for each genotype

Plot significant proportion variance explained by each genotype at each time point with the proportion of genes with significant proportion variance explained by the genotype

summary((df_long1%>%
 filter(!is.infinite(Value)))$Value)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   -4.35   -1.80   -1.55   -1.60   -1.35   -0.28  119305
value_range <- range((df_long1%>%
  filter(!is.infinite(Value)))$Value, na.rm = TRUE)

# Find the range of the proportion of non-NA values
prop_range <- c(0,0.3) #range(significant_prop$prop[significant_prop$prop!=1], na.rm = TRUE)

# Create the plot
fig3=df_long1%>%
  filter(!is.infinite(Value))%>%
  ggplot(aes(x = time, y = Value)) +
  geom_violin(trim = T, alpha = 0.5) +  # Violin plot
  geom_boxplot(width = 0.1,outlier.size = 0.5) +  # Boxplot within violin, without outliers
  geom_point(data = significant_prop%>%filter(prop!=1), 
             aes(x = time, y = (prop - min(prop_range)) / diff(prop_range) * diff(value_range) + min(value_range)),
             color = 'maroon',shape=8,size = 1, position = position_dodge(width = 0.9)) +  # Points for proportion of non-NA values scaled to the primary y-axis
  scale_y_continuous(
    sec.axis = sec_axis(~ (. - min(value_range)) / diff(value_range) * diff(prop_range) + min(prop_range), name = "Proportion of genes with FDR<0.05")
  ) +
  theme_bw() +
  labs(x = "",
       y = "log10(proportion variance explained)") +
  facet_wrap(~Factor)
fig3
## Warning: Removed 119305 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Removed 119305 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

ggsave(
  filename = file.path(out_dir,"genotype_timepoint_propVar.png"),
  plot     = fig3,
  width    = 10,
  height   = 10,
  units    = "in",
  dpi      = 300
)
## Warning: Removed 119305 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Removed 119305 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

proportion of genes with significant proportion variance explained

Missing point means that at that time point the genotype is not tested due to the lack of sufficient pseudobulk samples

fig4 = significant_prop%>%
  filter(prop!=1)%>%
  ggplot(aes(x=time,y=prop,group = Factor,color = Factor))+
  geom_point()+
  geom_line()+
  ylab("proportion of genes with FDR<0.05")+
  theme_bw()+
   theme(legend.position = "none")+
  facet_wrap(~Factor)
fig4
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?

ggsave(
  filename = file.path(out_dir,"genotype_proportionSignificant.png"),
  plot     = fig4,
  width    = 10,
  height   = 8,
  units    = "in",
  dpi      = 300
)
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?

summarize across genotypes

fig5 = significant_prop%>%
  filter(prop!=1)%>%
  ggplot(aes(x = time,y=prop,color = time))+
   geom_violin(trim = T, alpha = 0.5) +  # Violin plot
  geom_boxplot(width = 0.1,outlier.size = 0.5,aes(color = time)) + 
  xlab("")+
  ylab("median proportion of genes with FDR<0.05")+
   guides(color="none")+
  theme_bw()
fig5

ggsave(
  filename = file.path(out_dir,"celltype_prop_time.png"),
  plot     = fig5,
  width    = 8,
  height   = 4,
  units    = "in",
  dpi      = 300
)

Session information

gc()
##              used    (Mb) gc trigger    (Mb)   max used    (Mb)
## Ncells    8459573   451.8   14173694   757.0   14173694   757.0
## Vcells 3288123190 25086.4 6254144870 47715.4 6139780971 46842.9
sessionInfo()
## R version 4.4.0 (2024-04-24)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 18.04.6 LTS
## 
## Matrix products: default
## BLAS/LAPACK: FlexiBLAS OPENBLAS;  LAPACK version 3.11.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: America/Los_Angeles
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] viridisLite_0.4.2           lubridate_1.9.3            
##  [3] forcats_1.0.0               purrr_1.0.2                
##  [5] readr_2.1.5                 tidyverse_2.0.0            
##  [7] ggpubr_0.6.0                ggrepel_0.9.5              
##  [9] gridExtra_2.3               DESeq2_1.44.0              
## [11] SummarizedExperiment_1.34.0 Biobase_2.64.0             
## [13] MatrixGenerics_1.16.0       matrixStats_1.3.0          
## [15] GenomicRanges_1.56.0        GenomeInfoDb_1.40.0        
## [17] IRanges_2.38.0              S4Vectors_0.42.0           
## [19] BiocGenerics_0.50.0         readxl_1.4.3               
## [21] reshape2_1.4.4              pheatmap_1.0.12            
## [23] stringr_1.5.1               patchwork_1.2.0            
## [25] cowplot_1.1.3               tibble_3.2.1               
## [27] ggplot2_3.5.1               tidyr_1.3.1                
## [29] dplyr_1.1.4                 Matrix_1.7-0               
## [31] Seurat_5.1.0                SeuratObject_5.0.2         
## [33] sp_2.1-4                   
## 
## loaded via a namespace (and not attached):
##   [1] RcppAnnoy_0.0.22        splines_4.4.0           later_1.3.2            
##   [4] cellranger_1.1.0        polyclip_1.10-6         fastDummies_1.7.3      
##   [7] lifecycle_1.0.4         rstatix_0.7.2           globals_0.16.3         
##  [10] lattice_0.22-6          MASS_7.3-60.2           backports_1.4.1        
##  [13] magrittr_2.0.3          plotly_4.10.4           sass_0.4.9             
##  [16] rmarkdown_2.26          jquerylib_0.1.4         yaml_2.3.8             
##  [19] httpuv_1.6.15           sctransform_0.4.1       spam_2.10-0            
##  [22] spatstat.sparse_3.0-3   reticulate_1.36.1       pbapply_1.7-2          
##  [25] RColorBrewer_1.1-3      abind_1.4-5             zlibbioc_1.50.0        
##  [28] Rtsne_0.17              GenomeInfoDbData_1.2.12 irlba_2.3.5.1          
##  [31] listenv_0.9.1           spatstat.utils_3.0-4    goftest_1.2-3          
##  [34] RSpectra_0.16-1         spatstat.random_3.2-3   fitdistrplus_1.2-1     
##  [37] parallelly_1.37.1       leiden_0.4.3.1          codetools_0.2-20       
##  [40] DelayedArray_0.30.1     tidyselect_1.2.1        UCSC.utils_1.0.0       
##  [43] farver_2.1.2            viridis_0.6.5           spatstat.explore_3.2-7 
##  [46] jsonlite_1.8.8          progressr_0.14.0        ggridges_0.5.6         
##  [49] survival_3.6-4          systemfonts_1.1.0       tools_4.4.0            
##  [52] ragg_1.3.1              ica_1.0-3               Rcpp_1.0.12            
##  [55] glue_1.7.0              SparseArray_1.4.3       xfun_0.44              
##  [58] withr_3.0.0             fastmap_1.2.0           fansi_1.0.6            
##  [61] digest_0.6.35           timechange_0.3.0        R6_2.5.1               
##  [64] mime_0.12               textshaping_0.3.7       colorspace_2.1-0       
##  [67] scattermore_1.2         tensor_1.5              spatstat.data_3.0-4    
##  [70] utf8_1.2.4              generics_0.1.3          data.table_1.15.4      
##  [73] httr_1.4.7              htmlwidgets_1.6.4       S4Arrays_1.4.0         
##  [76] uwot_0.2.2              pkgconfig_2.0.3         gtable_0.3.5           
##  [79] lmtest_0.9-40           XVector_0.44.0          htmltools_0.5.8.1      
##  [82] carData_3.0-5           dotCall64_1.1-1         scales_1.3.0           
##  [85] png_0.1-8               knitr_1.46              rstudioapi_0.16.0      
##  [88] tzdb_0.4.0              nlme_3.1-164            cachem_1.0.8           
##  [91] zoo_1.8-12              KernSmooth_2.23-22      parallel_4.4.0         
##  [94] miniUI_0.1.1.1          pillar_1.9.0            grid_4.4.0             
##  [97] vctrs_0.6.5             RANN_2.6.1              promises_1.3.0         
## [100] car_3.1-2               xtable_1.8-4            cluster_2.1.6          
## [103] evaluate_0.23           cli_3.6.2               locfit_1.5-9.9         
## [106] compiler_4.4.0          rlang_1.1.3             crayon_1.5.2           
## [109] future.apply_1.11.2     ggsignif_0.6.4          labeling_0.4.3         
## [112] plyr_1.8.9              stringi_1.8.4           deldir_2.0-4           
## [115] BiocParallel_1.38.0     munsell_0.5.1           lazyeval_0.2.2         
## [118] spatstat.geom_3.2-9     RcppHNSW_0.6.0          hms_1.1.3              
## [121] future_1.33.2           shiny_1.8.1.1           highr_0.10             
## [124] ROCR_1.0-11             igraph_2.0.3            broom_1.0.5            
## [127] bslib_0.7.0